Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ authors = ["NVIDIA Carbide Engineering <carbide-dev@exchange.nvidia.com>"]
[workspace.dependencies]
clap = { version = "4", features = ["derive", "env"] }
libredfish = { git = "https://github.com/NVIDIA/libredfish.git", tag = "v0.44.4" }
librms = { git = "https://github.com/NVIDIA/nv-rms-client.git", tag = "v0.0.12-rc3" }
librms = { git = "https://github.com/NVIDIA/nv-rms-client.git", tag = "v0.0.12-rc4" }
ansi-to-html = "0.2.2"

tokio = { version = "1", features = ["full", "tracing"] }
Expand Down
16 changes: 16 additions & 0 deletions crates/api-test-helper/src/mock_rms.rs
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,10 @@ pub struct MockRmsApi {
Mutex<VecDeque<Result<rms::GetPowerStateResponse, RackManagerError>>>,
get_power_state_calls: Mutex<Vec<rms::GetPowerStateRequest>>,

get_power_state_by_device_list_responses:
Mutex<VecDeque<Result<rms::GetPowerStateByDeviceListResponse, RackManagerError>>>,
get_power_state_by_device_list_calls: Mutex<Vec<rms::GetPowerStateByDeviceListRequest>>,

sequence_rack_power_responses:
Mutex<VecDeque<Result<rms::SequenceRackPowerResponse, RackManagerError>>>,
sequence_rack_power_calls: Mutex<Vec<rms::SequenceRackPowerRequest>>,
Expand Down Expand Up @@ -244,6 +248,8 @@ impl MockRmsApi {
set_power_state_calls: Default::default(),
set_power_state_by_device_list_responses: Default::default(),
set_power_state_by_device_list_calls: Default::default(),
get_power_state_by_device_list_responses: Default::default(),
get_power_state_by_device_list_calls: Default::default(),
get_power_state_responses: Default::default(),
get_power_state_calls: Default::default(),
sequence_rack_power_responses: Default::default(),
Expand Down Expand Up @@ -809,6 +815,16 @@ fn pop_or_err<T>(

#[async_trait::async_trait]
impl RmsApi for MockRmsApi {
/// Mock `GetPowerStateByDeviceList`: records the incoming request, then
/// hands the request queue's next scripted result to `pop_or_err`.
async fn get_power_state_by_device_list(
    &self,
    cmd: rms::GetPowerStateByDeviceListRequest,
) -> Result<rms::GetPowerStateByDeviceListResponse, RackManagerError> {
    {
        // Scope the call-log guard so it is released before the response
        // queue is locked, matching the original statement-by-statement locking.
        let mut recorded_calls = self.get_power_state_by_device_list_calls.lock().await;
        recorded_calls.push(cmd);
    }

    let mut scripted_responses = self.get_power_state_by_device_list_responses.lock().await;
    pop_or_err(&mut scripted_responses)
}
async fn set_power_state(
&self,
cmd: rms::SetPowerStateRequest,
Expand Down
11 changes: 6 additions & 5 deletions crates/power-shelf-controller/src/maintenance.rs
Original file line number Diff line number Diff line change
Expand Up @@ -245,11 +245,12 @@ async fn invoke_rms_power_operation(
}

/// Build the `rms::NewNodeInfo` describing this power shelf for inclusion
/// in a `SetPowerStateByDeviceList` request. Resolves the BMC IP from the
/// database and BMC credentials via the credential manager, since the
/// caller-supplied variant of the RPC requires the BMC connection details
/// inline rather than relying on RMS's inventory.
async fn build_power_shelf_node_info(
/// in any caller-supplied `NodeSet` request (`SetPowerStateByDeviceList`
/// from `Maintenance`, `GetPowerStateByDeviceList` from `Ready`). Resolves
/// the BMC IP from the database and BMC credentials via the credential
/// manager, since these RPCs require the BMC connection details inline
/// rather than relying on RMS's inventory.
pub(super) async fn build_power_shelf_node_info(
power_shelf_id: &PowerShelfId,
state: &PowerShelf,
rack_id: String,
Expand Down
206 changes: 198 additions & 8 deletions crates/power-shelf-controller/src/ready.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,27 +17,33 @@

//! Handler for PowerShelfControllerState::Ready.

use carbide_rack::rack_manager_error;
use carbide_uuid::power_shelf::PowerShelfId;
use model::power_shelf::{PowerShelf, PowerShelfControllerState};
use db::power_shelf as db_power_shelf;
use librms::protos::rack_manager as rms;
use model::power_shelf::{PowerShelf, PowerShelfControllerState, PowerShelfStatus};
use sqlx::PgTransaction;
use state_controller::state_handler::{
StateHandlerContext, StateHandlerError, StateHandlerOutcome,
};

use crate::context::PowerShelfStateHandlerContextObjects;
use crate::maintenance::build_power_shelf_node_info;

/// Handles the Ready state for a power shelf.
///
/// If the power shelf is marked for deletion, transitions to `Deleting`.
/// If a maintenance request has been posted via
/// `power_shelf_maintenance_requested`, transitions to `Maintenance` with the
/// requested operation (PowerOn / PowerOff). Otherwise idles.
/// requested operation (PowerOn / PowerOff). Otherwise polls RMS for the
/// current power state (best-effort observation) and idles.
///
/// TODO: Implement PowerShelf monitoring (health checks, status updates,
/// power consumption / efficiency tracking).
pub async fn handle_ready(
power_shelf_id: &PowerShelfId,
state: &mut PowerShelf,
_ctx: &mut StateHandlerContext<'_, PowerShelfStateHandlerContextObjects>,
ctx: &mut StateHandlerContext<'_, PowerShelfStateHandlerContextObjects>,
) -> Result<StateHandlerOutcome<PowerShelfControllerState>, StateHandlerError> {
if state.is_marked_as_deleted() {
return Ok(StateHandlerOutcome::transition(
Expand All @@ -58,9 +64,193 @@ pub async fn handle_ready(
));
}

tracing::info!("PowerShelf {} is ready", power_shelf_id,);
Ok(StateHandlerOutcome::wait(format!(
"PowerShelf {} is ready",
power_shelf_id
)))
let txn = poll_rms_power_state(power_shelf_id, state, ctx).await;

Ok(StateHandlerOutcome::do_nothing().with_txn_opt(txn))
}
///
/// On a successful response, the observed `pstate` for this power shelf is
/// persisted to the `power_shelves.status` column and the in-memory `state`
/// is updated to match. The returned `PgTransaction` (if any) carries that
/// status write so the caller can attach it to the `Ready` outcome and have
/// the state-controller framework commit it alongside the usual outcome
/// bookkeeping.
async fn poll_rms_power_state(
power_shelf_id: &PowerShelfId,
state: &mut PowerShelf,
ctx: &mut StateHandlerContext<'_, PowerShelfStateHandlerContextObjects>,
) -> Option<PgTransaction<'static>> {
let Some(rms_client) = ctx.services.rms_client.as_ref() else {
tracing::debug!(
power_shelf_id = %power_shelf_id,
"PowerShelf Ready: skipping RMS GetPowerStateByDeviceList; RMS client not configured",
);
return None;
};

let Some(rack_id) = state.rack_id.as_ref() else {
tracing::debug!(
power_shelf_id = %power_shelf_id,
"PowerShelf Ready: skipping RMS GetPowerStateByDeviceList; power shelf has no rack association",
);
return None;
};

let device = match build_power_shelf_node_info(
power_shelf_id,
state,
rack_id.to_string(),
&ctx.services.db_pool,
ctx.services.credential_manager.as_ref(),
)
.await
{
Ok(device) => device,
Err(cause) => {
tracing::debug!(
power_shelf_id = %power_shelf_id,
rack_id = %rack_id,
cause = %cause,
"PowerShelf Ready: skipping RMS GetPowerStateByDeviceList; unable to build NodeSet",
);
return None;
}
};

let request = rms::GetPowerStateByDeviceListRequest {
nodes: Some(rms::NodeSet {
devices: vec![device],
}),
..Default::default()
};

let rack_id_str = rack_id.to_string();
let response = match rms_client.get_power_state_by_device_list(request).await {
Ok(response) => response,
Err(error) => {
let error = rack_manager_error("get_power_state_by_device_list", error);
tracing::warn!(
power_shelf_id = %power_shelf_id,
rack_id = %rack_id_str,
error = %error,
"RMS GetPowerStateByDeviceList transport error",
);
return None;
}
};

let batch = response.response.clone().unwrap_or_default();
if !(batch.status == rms::ReturnCode::Success as i32 && batch.failed_nodes == 0) {
tracing::warn!(
power_shelf_id = %power_shelf_id,
rack_id = %rack_id_str,
batch_status = batch.status,
successful_nodes = batch.successful_nodes,
failed_nodes = batch.failed_nodes,
message = %batch.message,
"RMS GetPowerStateByDeviceList returned non-Success result",
);
return None;
Comment thread
vinodchitraliNVIDIA marked this conversation as resolved.
}

tracing::info!(
power_shelf_id = %power_shelf_id,
rack_id = %rack_id_str,
successful_nodes = batch.successful_nodes,
pstates = ?response
.node_power_states
.iter()
.map(|node| (node.node_id.as_str(), node.pstate.as_str()))
.collect::<Vec<_>>(),
"RMS GetPowerStateByDeviceList succeeded",
);

persist_observed_power_state(power_shelf_id, state, ctx, &response.node_power_states).await
}

/// Look up the `NodePowerState` for this power shelf in the RMS response,
/// stamp the value into `state.status`, and persist it via
/// `db_power_shelf::update`. Returns the open `PgTransaction` so the caller
/// can attach it to the `Ready` outcome.
///
/// The node is matched by comparing `node_id` against the string form of
/// `power_shelf_id`, and the reported `pstate` is lower-cased before it is
/// compared or stored. If the stored `power_state` already equals the
/// observed value, nothing is written and `None` is returned.
///
/// Status persistence is best-effort: if RMS did not echo a result for this
/// node, or if the DB write fails, the in-memory state is left untouched
/// and `None` is returned — `Ready` must stay in `Ready` regardless.
async fn persist_observed_power_state(
    power_shelf_id: &PowerShelfId,
    state: &mut PowerShelf,
    ctx: &mut StateHandlerContext<'_, PowerShelfStateHandlerContextObjects>,
    node_power_states: &[rms::NodePowerState],
) -> Option<PgTransaction<'static>> {
    let node_id = power_shelf_id.to_string();
    let Some(observed) = node_power_states
        .iter()
        .find(|node| node.node_id == node_id)
    else {
        tracing::debug!(
            power_shelf_id = %power_shelf_id,
            "RMS GetPowerStateByDeviceList: no NodePowerState echoed for this power shelf; skipping status update",
        );
        return None;
    };

    let new_power_state = observed.pstate.to_lowercase();

    // Check for "no change" before building the replacement status, so a poll
    // that observes the already-stored value does not clone any fields.
    if state
        .status
        .as_ref()
        .is_some_and(|current| current.power_state == new_power_state)
    {
        tracing::debug!(
            power_shelf_id = %power_shelf_id,
            power_state = %new_power_state,
            "PowerShelf status power_state unchanged; skipping DB write",
        );
        return None;
    }

    // Carry over the fields RMS does not report; with no prior status, fall
    // back to the configured name and an empty health status.
    let new_status = match state.status.as_ref() {
        Some(existing) => PowerShelfStatus {
            shelf_name: existing.shelf_name.clone(),
            power_state: new_power_state.clone(),
            health_status: existing.health_status.clone(),
        },
        None => PowerShelfStatus {
            shelf_name: state.config.name.clone(),
            power_state: new_power_state.clone(),
            health_status: String::new(),
        },
    };

    // Keep the old value so the in-memory state can be rolled back if the
    // write below does not go through.
    let previous_status = state.status.replace(new_status);

    let mut txn = match ctx.services.db_pool.begin().await {
        Ok(txn) => txn,
        Err(error) => {
            state.status = previous_status;
            tracing::warn!(
                power_shelf_id = %power_shelf_id,
                error = %error,
                "PowerShelf Ready: failed to begin txn while persisting observed power state",
            );
            return None;
        }
    };

    if let Err(error) = db_power_shelf::update(state, &mut txn).await {
        state.status = previous_status;
        tracing::warn!(
            power_shelf_id = %power_shelf_id,
            error = %error,
            "PowerShelf Ready: failed to persist observed power state to DB",
        );
        return None;
    }

    tracing::info!(
        power_shelf_id = %power_shelf_id,
        power_state = %new_power_state,
        "PowerShelf Ready: persisted observed power state from RMS",
    );

    Some(txn)
}
23 changes: 23 additions & 0 deletions crates/rack/src/rms_client.rs
Original file line number Diff line number Diff line change
Expand Up @@ -158,6 +158,10 @@ pub mod test_support {

fn build_mock_client(&self) -> MockRmsClient {
MockRmsClient {
submitted_get_power_state_by_device_list_requests: Arc::new(Mutex::new(Vec::new())),
queued_get_power_state_by_device_list_responses: Arc::new(Mutex::new(
VecDeque::new(),
)),
fail_add_node: self.fail_add_node.clone(),
fail_inventory_get: self.fail_inventory_get.clone(),
registered_nodes: self.registered_nodes.clone(),
Expand Down Expand Up @@ -452,6 +456,10 @@ pub mod test_support {
switch_system_image_job_statuses:
Arc<Mutex<HashMap<String, rms::GetSwitchSystemImageJobStatusResponse>>>,
switch_system_image_job_errors: Arc<Mutex<HashMap<String, String>>>,
submitted_get_power_state_by_device_list_requests:
Arc<Mutex<Vec<rms::GetPowerStateByDeviceListRequest>>>,
queued_get_power_state_by_device_list_responses:
Arc<Mutex<VecDeque<Result<rms::GetPowerStateByDeviceListResponse, RackManagerError>>>>,
submitted_get_device_info_by_device_list_requests:
Arc<Mutex<Vec<rms::GetDeviceInfoByDeviceListRequest>>>,
queued_get_device_info_by_device_list_responses:
Expand All @@ -473,6 +481,21 @@ pub mod test_support {

#[async_trait::async_trait]
impl RmsApi for MockRmsClient {
/// Mock `GetPowerStateByDeviceList`: records the request, then returns the
/// next queued response. When no response has been queued, a default
/// (empty) `GetPowerStateByDeviceListResponse` is returned as `Ok`.
async fn get_power_state_by_device_list(
    &self,
    cmd: rms::GetPowerStateByDeviceListRequest,
) -> Result<rms::GetPowerStateByDeviceListResponse, RackManagerError> {
    self.submitted_get_power_state_by_device_list_requests
        .lock()
        .await
        .push(cmd);
    self.queued_get_power_state_by_device_list_responses
        .lock()
        .await
        .pop_front()
        // Build the fallback lazily, only when the queue is actually empty.
        .unwrap_or_else(|| Ok(rms::GetPowerStateByDeviceListResponse::default()))
}

async fn get_device_info_by_device_list(
&self,
cmd: rms::GetDeviceInfoByDeviceListRequest,
Expand Down
Loading