Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
106 changes: 71 additions & 35 deletions rs/recovery/src/admin_helper.rs
Original file line number Diff line number Diff line change
Expand Up @@ -81,35 +81,6 @@ impl AdminHelper {
}
}

pub fn get_halt_subnet_command(
&self,
subnet_id: SubnetId,
is_halted: bool,
keys: &[String],
) -> IcAdmin {
let mut ic_admin = self.get_ic_admin_cmd_base();

ic_admin
.add_positional_argument("propose-to-update-subnet")
.add_argument("subnet", subnet_id)
.add_argument("is-halted", is_halted);
if !keys.is_empty() {
ic_admin.add_arguments(SSH_READONLY_ACCESS_ARG, keys.iter().map(quote));
}
ic_admin.add_argument(
SUMMARY_ARG,
quote(format!(
"{} subnet {}, for recovery and update ssh readonly access",
if is_halted { "Halt" } else { "Unhalt" },
subnet_id,
)),
);

self.add_proposer_args(&mut ic_admin);

ic_admin
}

pub fn get_propose_to_take_subnet_offline_for_repairs_command(
&self,
subnet_id: SubnetId,
Expand Down Expand Up @@ -423,24 +394,89 @@ mod tests {
const FAKE_REPLICA_VERSION: &str = "fake_replica_version";

#[test]
fn get_halt_subnet_command_test() {
fn get_propose_to_take_subnet_offline_for_repairs_without_keys_command_test() {
let result = fake_admin_helper()
.get_halt_subnet_command(
.get_propose_to_take_subnet_offline_for_repairs_command(
subnet_id_from_str(FAKE_SUBNET_ID_1),
&[],
&BTreeMap::new(),
)
.join(" ");

assert_eq!(
result,
"/fake/ic/admin/dir/ic-admin \
--nns-url \"https://fake_nns_url.com:8080/\" \
propose-to-take-subnet-offline-for-repairs \
--subnet gpvux-2ejnk-3hgmh-cegwf-iekfc-b7rzs-hrvep-5euo2-3ywz3-k3hcb-cqe \
--summary \"Take subnet gpvux-2ejnk-3hgmh-cegwf-iekfc-b7rzs-hrvep-5euo2-3ywz3-k3hcb-cqe offline for recovery, updating readonly and write access ssh keys\" \
--test-neuron-proposer"
);
}

#[test]
fn get_propose_to_take_subnet_offline_for_repairs_with_readonly_key_command_test() {
let result = fake_admin_helper()
.get_propose_to_take_subnet_offline_for_repairs_command(
subnet_id_from_str(FAKE_SUBNET_ID_1),
&["fake public key".to_string()],
&BTreeMap::new(),
)
.join(" ");

assert_eq!(
result,
"/fake/ic/admin/dir/ic-admin \
--nns-url \"https://fake_nns_url.com:8080/\" \
propose-to-take-subnet-offline-for-repairs \
--subnet gpvux-2ejnk-3hgmh-cegwf-iekfc-b7rzs-hrvep-5euo2-3ywz3-k3hcb-cqe \
--ssh-readonly-access \"fake public key\" \
--summary \"Take subnet gpvux-2ejnk-3hgmh-cegwf-iekfc-b7rzs-hrvep-5euo2-3ywz3-k3hcb-cqe offline for recovery, updating readonly and write access ssh keys\" \
--test-neuron-proposer"
);
}

#[test]
fn get_propose_to_take_subnet_offline_for_repairs_with_readonly_and_write_keys_command_test() {
let result = fake_admin_helper()
.get_propose_to_take_subnet_offline_for_repairs_command(
subnet_id_from_str(FAKE_SUBNET_ID_1),
/*is_halted=*/ true,
&["fake public key".to_string()],
&BTreeMap::from([(
node_id_from_str(FAKE_NODE_ID),
vec!["fake write access public key".to_string()],
)]),
)
.join(" ");

assert_eq!(
result,
"/fake/ic/admin/dir/ic-admin \
--nns-url \"https://fake_nns_url.com:8080/\" \
propose-to-update-subnet \
propose-to-take-subnet-offline-for-repairs \
--subnet gpvux-2ejnk-3hgmh-cegwf-iekfc-b7rzs-hrvep-5euo2-3ywz3-k3hcb-cqe \
--is-halted true \
--ssh-readonly-access \"fake public key\" \
--summary \"Halt subnet gpvux-2ejnk-3hgmh-cegwf-iekfc-b7rzs-hrvep-5euo2-3ywz3-k3hcb-cqe, for recovery and update ssh readonly access\" \
--ssh-node-state-write-access \"nqpqw-cp42a-rmdsx-fpui3-ncne5-kzq6o-m67an-w25cx-zu636-lcf2v-fqe:fake write access public key\" \
--summary \"Take subnet gpvux-2ejnk-3hgmh-cegwf-iekfc-b7rzs-hrvep-5euo2-3ywz3-k3hcb-cqe offline for recovery, updating readonly and write access ssh keys\" \
--test-neuron-proposer"
);
}

#[test]
fn get_propose_to_bring_subnet_back_online_after_repairs_command_test() {
let result = fake_admin_helper()
.get_propose_to_bring_subnet_back_online_after_repairs_command(subnet_id_from_str(
FAKE_SUBNET_ID_1,
))
.join(" ");

assert_eq!(
result,
"/fake/ic/admin/dir/ic-admin \
--nns-url \"https://fake_nns_url.com:8080/\" \
propose-to-bring-subnet-back-online-after-repairs \
--subnet gpvux-2ejnk-3hgmh-cegwf-iekfc-b7rzs-hrvep-5euo2-3ywz3-k3hcb-cqe \
--summary \"Bring subnet gpvux-2ejnk-3hgmh-cegwf-iekfc-b7rzs-hrvep-5euo2-3ywz3-k3hcb-cqe back online after repairs\" \
--test-neuron-proposer"
);
}
Expand Down
42 changes: 14 additions & 28 deletions rs/recovery/src/app_subnet_recovery.rs
Original file line number Diff line number Diff line change
Expand Up @@ -401,20 +401,18 @@ impl RecoveryIterator<StepType, StepTypeIter> for AppSubnetRecovery {
vec![]
};

if let Some((node_id, pub_key)) = self.params.write_node_id_and_pub_key.clone() {
Ok(Box::new(self.recovery.take_subnet_offline_for_repairs(
Ok(Box::new(
self.recovery.take_subnet_offline_for_repairs(
self.params.subnet_id,
&subnet_readonly_keys,
&BTreeMap::from([(node_id, vec![pub_key])]),
)))
} else {
// TODO (CON-1637): Remove this branch and only use `take_subnet_offline_for_repairs`
Ok(Box::new(self.recovery.halt_subnet(
self.params.subnet_id,
true,
&subnet_readonly_keys,
)))
}
&self
.params
.write_node_id_and_pub_key
.clone()
.map(|(node_id, pub_key)| BTreeMap::from([(node_id, vec![pub_key])]))
.unwrap_or_default(),
),
))
}

StepType::DownloadCertifications => {
Expand Down Expand Up @@ -580,22 +578,10 @@ impl RecoveryIterator<StepType, StepTypeIter> for AppSubnetRecovery {
}
}

StepType::Unhalt => {
if self.params.write_node_id_and_pub_key.is_some() {
Ok(Box::new(
self.recovery
.bring_subnet_back_online_after_repairs(self.params.subnet_id),
))
} else {
// TODO (CON-1637): Remove this branch and only use
// `bring_subnet_back_online_after_repairs`
Ok(Box::new(self.recovery.halt_subnet(
self.params.subnet_id,
false,
&["".to_string()],
)))
}
}
StepType::Unhalt => Ok(Box::new(
self.recovery
.bring_subnet_back_online_after_repairs(self.params.subnet_id),
)),

StepType::Cleanup => Ok(Box::new(self.recovery.get_cleanup_step())),
}
Expand Down
15 changes: 0 additions & 15 deletions rs/recovery/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -255,21 +255,6 @@ impl Recovery {
Ok(max_height)
}

/// Return a recovery [AdminStep] to halt or unhalt the given subnet
pub fn halt_subnet(
&self,
subnet_id: SubnetId,
is_halted: bool,
keys: &[String],
) -> impl Step + use<> {
AdminStep {
logger: self.logger.clone(),
ic_admin_cmd: self
.admin_helper
.get_halt_subnet_command(subnet_id, is_halted, keys),
}
}

/// Return a recovery [AdminStep] to take the given subnet offline for repairs
pub fn take_subnet_offline_for_repairs(
&self,
Expand Down
7 changes: 2 additions & 5 deletions rs/recovery/subnet_splitting/src/subnet_splitting.rs
Original file line number Diff line number Diff line change
Expand Up @@ -307,11 +307,8 @@ impl SubnetSplitting {
}

fn unhalt(&self, target_subnet: TargetSubnet) -> impl Step + use<> {
self.recovery.halt_subnet(
self.subnet_id(target_subnet),
/*is_halted=*/ false,
/*keys=*/ &[],
)
self.recovery
.bring_subnet_back_online_after_repairs(self.subnet_id(target_subnet))
}

fn propose_cup(&self, target_subnet: TargetSubnet) -> RecoveryResult<impl Step + use<>> {
Expand Down
18 changes: 8 additions & 10 deletions rs/registry/admin/bin/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -505,7 +505,7 @@ enum SubCommand {
///
/// This is the first step of subnet recovery. Previously the first step was
/// done using propose-to-update-subnet. However, that does not support
/// changing ssn_node_state_write_access, which is needed when a subnet has
/// changing ssh_node_state_write_access, which is needed when a subnet has
/// SEV enabled everywhere (or the subnet has no DFINITY node).
///
/// At the end of subnet recovery, run propose-to-bring-subnet-back-online.
Expand Down Expand Up @@ -835,7 +835,7 @@ struct ProposeToTakeSubnetOfflineForRepairsCmd {
/// This usually isn't a problem, because the list is typically empty when
/// this proposal is used as part of subnet recovery.
#[clap(long, num_args(1..))]
pub ssh_readonly_access: Vec<String>,
pub ssh_readonly_access: Option<Vec<String>>,

/// Similar to the --ssh-readonly-access flag, but there are some important
/// differences:
Expand All @@ -858,7 +858,7 @@ struct ProposeToTakeSubnetOfflineForRepairsCmd {
/// clobber. However, in practice, it would probably be empty to begin with,
/// so most likely, this won't be an issue.
#[clap(long, value_parser, num_args(1..))]
pub ssh_node_state_write_access: Vec<NodeSshAccessFlagValue>,
pub ssh_node_state_write_access: Option<Vec<NodeSshAccessFlagValue>>,

/// List of replica version IDs to recall. These versions will be marked as
/// recalled for this subnet, preventing them from being upgraded to them. If the
Expand Down Expand Up @@ -895,15 +895,13 @@ impl ProposalPayload<SetSubnetOperationalLevelPayload> for ProposeToTakeSubnetOf
let ssh_node_state_write_access = self
.ssh_node_state_write_access
.clone()
.into_iter()
.map(NodeSshAccess::from)
.collect();
.map(|keys| keys.into_iter().map(NodeSshAccess::from).collect());

SetSubnetOperationalLevelPayload {
subnet_id: Some(SubnetId::from(self.subnet)),
operational_level: Some(operational_level::DOWN_FOR_REPAIRS),
ssh_readonly_access: Some(self.ssh_readonly_access.clone()),
ssh_node_state_write_access: Some(ssh_node_state_write_access),
ssh_readonly_access: self.ssh_readonly_access.clone(),
ssh_node_state_write_access,
recalled_replica_version_ids,
}
}
Expand All @@ -924,14 +922,14 @@ impl FromStr for NodeSshAccessFlagValue {
let node_id = parts.next().ok_or("Missing node ID.")?;
let public_keys = parts
.next()
.ok_or("Missing semicolon separating node ID and SSH public key.")?
.ok_or("Missing semicolon separating node ID and SSH public keys.")?
.to_string();

// Parse node_id.
let node_id = PrincipalId::from_str(node_id).map_err(|err| format!("{err}"))?;
let node_id = NodeId::from(node_id);

// Parse public_keys, by simply splitting on ','.<
// Parse public_keys, by simply splitting on ','.
let public_keys = public_keys
.split(',')
.map(|public_key| public_key.to_string())
Expand Down
10 changes: 2 additions & 8 deletions rs/tests/consensus/subnet_recovery/common.rs
Original file line number Diff line number Diff line change
Expand Up @@ -560,13 +560,7 @@ fn app_subnet_recovery_test(env: TestEnv, cfg: TestConfig) {
let f = (cfg.subnet_size - 1) / 3;
break_nodes(&app_nodes.take(f + 1).collect::<Vec<_>>(), &logger);
} else {
halt_subnet(
&admin_helper,
&download_state_node,
app_subnet_id,
&[],
&logger,
)
halt_subnet(&admin_helper, &download_state_node, app_subnet_id, &logger);
}
assert_subnet_is_broken(
&download_state_node.get_public_url(),
Expand Down Expand Up @@ -1010,7 +1004,7 @@ fn corrupt_latest_cup(
"Did not find log entry that CUP is corrupted"
);

unhalt_subnet(admin_helper, subnet.subnet_id, &[], logger);
unhalt_subnet(admin_helper, subnet.subnet_id, logger);
}

/// Print ID and IP of the source subnet, the first app subnet found that is not the source, and all
Expand Down
19 changes: 9 additions & 10 deletions rs/tests/consensus/subnet_recovery/utils.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ use ic_system_test_driver::{
};
use ic_types::SubnetId;
use slog::{Logger, info};
use std::{fmt::Debug, path::PathBuf};
use std::{collections::BTreeMap, fmt::Debug, path::PathBuf};
use url::Url;

pub const READONLY_USERNAME: &str = "readonly";
Expand Down Expand Up @@ -86,7 +86,6 @@ pub fn halt_subnet(
admin_helper: &AdminHelper,
subnet_node: &IcNodeSnapshot,
subnet_id: SubnetId,
keys: &[String],
Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I removed this argument just for consistency with the signature of unhalt_subnet because it was not used anyways. If we'd ever want to set some keys while halting the subnet, we could re-introduce the argument at that moment.

logger: &Logger,
) {
info!(logger, "Halting subnet {subnet_id}...");
Expand All @@ -96,7 +95,11 @@ pub fn halt_subnet(

AdminStep {
logger: logger.clone(),
ic_admin_cmd: admin_helper.get_halt_subnet_command(subnet_id, true, keys),
ic_admin_cmd: admin_helper.get_propose_to_take_subnet_offline_for_repairs_command(
subnet_id,
&[],
&BTreeMap::new(),
),
}
.exec()
.expect("Failed to halt subnet");
Expand All @@ -115,16 +118,12 @@ pub fn halt_subnet(

/// Unhalt the given subnet by executing the corresponding ic-admin command through the given NNS
/// URL.
pub fn unhalt_subnet(
admin_helper: &AdminHelper,
subnet_id: SubnetId,
keys: &[String],
logger: &Logger,
) {
pub fn unhalt_subnet(admin_helper: &AdminHelper, subnet_id: SubnetId, logger: &Logger) {
info!(logger, "Unhalting subnet {subnet_id}...");
AdminStep {
logger: logger.clone(),
ic_admin_cmd: admin_helper.get_halt_subnet_command(subnet_id, false, keys),
ic_admin_cmd: admin_helper
.get_propose_to_bring_subnet_back_online_after_repairs_command(subnet_id),
}
.exec()
.expect("Failed to unhalt subnet");
Expand Down
Loading