diff --git a/Cargo.lock b/Cargo.lock index 69cc77e8eb18..bfdffc913765 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -21412,7 +21412,11 @@ dependencies = [ "chrono", "dfn_candid", "futures", + "ic-agent 0.40.1", "ic-system-test-driver", + "ic-types", + "ic-universal-canister", + "ic-utils 0.40.1", "slog", "tokio", ] @@ -23546,6 +23550,7 @@ version = "0.9.0" dependencies = [ "candid", "canister-test", + "futures", "ic-cdk", "ic-management-canister-types-private", "ic-state-machine-tests", diff --git a/rs/execution_environment/Contributing.md b/rs/execution_environment/Contributing.md index b746a9d63b02..88a9c6e932ae 100644 --- a/rs/execution_environment/Contributing.md +++ b/rs/execution_environment/Contributing.md @@ -41,7 +41,6 @@ The public API of the Management Canister is defined in Candid. Candid is design 8. Write tests to cover the new or updated functionality: - Use the `ExecutionTest` framework by default. - Use the `StateMachine` framework if the feature involves inter-canister calls, canister HTTPS outcalls, threshold signatures, or checkpointing. These require mocked Consensus layer outputs or a full state manager. - - Make sure to add new scenarios to `rs/execution_environment/tests/memory_matrix.rs` if your change introduces new code paths dealing with canister memory usage/allocation. 9. Once the *Interface Specification* change has been agreed on, the public Management Canister [types](https://crates.io/crates/ic-management-canister-types), [Motoko](https://github.com/dfinity/motoko), and [Rust CDK](https://github.com/dfinity/cdk-rs) can be updated to use the new API on a feature branch. Coordinate with *@eng-sdk* and *@eng-motoko* as needed. The new functionality is enabled for testing in PocketIC (on a PocketIC instance created with `enable_beta_features` set) by enabling the corresponding feature flags in `rs/pocket_ic_server/src/beta_features.rs`. diff --git a/rs/execution_environment/tests/memory_matrix.rs b/rs/execution_environment/tests/memory_matrix.rs index 0783fc597622..1d60463c34eb 100644 --- a/rs/execution_environment/tests/memory_matrix.rs +++ b/rs/execution_environment/tests/memory_matrix.rs @@ -7,28 +7,17 @@ It defines multiple *scenarios* and their expectations in terms of memory usage and performs multiple *runs* of every scenarios with various initial parameters. The runs ensure the following properties for every scenario: -- reserved cycles and subnet available memory are updated properly in both successful and failed executions; +- subnet available memory is updated properly in both successful and failed executions; - the execution fails if the subnet memory capacity would be exceeded; - the execution fails if the reserved cycles limit would be exceeded; - the execution fails if the canister would become frozen; - the execution fails if the canister does not have sufficient balance to reserve storage cycles; - the execution does not allocate additional memory for canisters with memory allocation. 
-Every memory matrix test has the following components: -- a "setup" function takes `&mut ExecutionTest` and `CanisterId` of an empty canister in that `ExecutionTest`, - performs a setup of that canister, and returns arbitrary data of choice (could also be `()` if no data are needed) - that are relayed to the "operation" function; -- an "operation" function takes `&mut ExecutionTest`, `CanisterId` of the canister set up before, and - the data produced by the "setup" function before; -- an instance of `ScenarioParams` also containing `Scenario` and `MemoryUsageChange` describing - the kind of scenario and its expectations in terms of canister memory usage change; -- an actual invokation of the matrix test suite implemented by the function `test_memory_suite`. - -The existing scenarios cover the following: +The scenarios cover the following: - growing WASM/stable memory in canister (update) entry point; - growing WASM/stable memory in canister reply/cleanup callback; - taking a canister snapshot (both growing and shrinking canister memory usage); -- taking a canister snapshot and uninstalling code atomically; - replacing a canister snapshot by a snapshot of the same size; - loading a canister snapshot (both growing and shrinking canister memory usage); - deleting a canister snapshot; diff --git a/rs/rust_canisters/statesync_test/BUILD.bazel b/rs/rust_canisters/statesync_test/BUILD.bazel index c64c0aeff96a..b3df9c29a30a 100644 --- a/rs/rust_canisters/statesync_test/BUILD.bazel +++ b/rs/rust_canisters/statesync_test/BUILD.bazel @@ -7,6 +7,7 @@ package(default_visibility = ["//visibility:public"]) DEPENDENCIES = [ # Keep sorted. "@crate_index//:candid", + "@crate_index//:futures", "@crate_index//:ic-cdk", "@crate_index//:lazy_static", "@crate_index//:rand", diff --git a/rs/rust_canisters/statesync_test/Cargo.toml b/rs/rust_canisters/statesync_test/Cargo.toml index 36c49c162dca..69066d7db2b9 100644 --- a/rs/rust_canisters/statesync_test/Cargo.toml +++ b/rs/rust_canisters/statesync_test/Cargo.toml @@ -12,6 +12,7 @@ path = "src/main.rs" [dependencies] candid = { workspace = true } +futures = { workspace = true } ic-cdk = { workspace = true } lazy_static = { workspace = true } rand = { workspace = true } diff --git a/rs/rust_canisters/statesync_test/src/main.rs b/rs/rust_canisters/statesync_test/src/main.rs index f59c23bc269c..8b99d28f6b2a 100644 --- a/rs/rust_canisters/statesync_test/src/main.rs +++ b/rs/rust_canisters/statesync_test/src/main.rs @@ -1,3 +1,7 @@ +use futures::{StreamExt, stream}; +use ic_cdk::management_canister::{ + ProvisionalCreateCanisterWithCyclesArgs, provisional_create_canister_with_cycles, +}; use ic_cdk::stable::{ WASM_PAGE_SIZE_IN_BYTES as PAGE_SIZE, stable_grow, stable_size, stable_write, }; @@ -78,4 +82,28 @@ async fn read_state(index: usize) -> Result { } } +#[update] +async fn create_many_canisters(n: u64) { + let mut futs = vec![]; + + for _ in 0..n { + let fut = async { + let create_args = ProvisionalCreateCanisterWithCyclesArgs { + amount: None, + settings: None, + specified_id: None, + }; + provisional_create_canister_with_cycles(&create_args) + .await + .expect("Failed to create canister"); + }; + futs.push(fut); + } + + stream::iter(futs) + .buffer_unordered(500) // limit concurrency to 500 (inter-canister queue capacity) + .collect::<Vec<_>>() + .await; +} + fn main() {} diff --git a/rs/rust_canisters/statesync_test/test/test.rs b/rs/rust_canisters/statesync_test/test/test.rs index 61b89c071105..27c25566a9e1 100644 --- a/rs/rust_canisters/statesync_test/test/test.rs
+++ b/rs/rust_canisters/statesync_test/test/test.rs @@ -3,24 +3,26 @@ use canister_test::*; use ic_management_canister_types_private::CanisterSettingsArgsBuilder; use ic_state_machine_tests::StateMachine; +fn deploy_state_sync_test_canister(env: &StateMachine) -> CanisterId { + let features = []; + let wasm = Project::cargo_bin_maybe_from_env("statesync-test-canister", &features); + env.install_canister( + wasm.bytes(), + vec![], + Some( + CanisterSettingsArgsBuilder::new() + .with_memory_allocation(8 * 1024 * 1024 * 1024) + .build(), + ), + ) + .expect("Failed to install canister") +} + #[test] fn test_statesync_test_canisters() { let env = StateMachine::new(); - let features = []; - let wasm = Project::cargo_bin_maybe_from_env("statesync-test-canister", &features); - let canister_id = env - .install_canister( - wasm.bytes(), - vec![], - Some( - CanisterSettingsArgsBuilder::new() - .with_memory_allocation(8 * 1024 * 1024 * 1024) - .build(), - ), - ) - .expect("Failed to install canister"); - + let canister_id = deploy_state_sync_test_canister(&env); let result = env .query(canister_id, "read_state", Encode!(&0usize).unwrap()) .unwrap(); @@ -56,6 +58,26 @@ fn test_statesync_test_canisters() { ); } +#[test] +fn test_create_many_canisters() { + let env = StateMachine::new(); + + let seed_canister_id = deploy_state_sync_test_canister(&env); + + let num_canisters: u64 = 1000; + let result = env + .execute_ingress( + seed_canister_id, + "create_many_canisters", + Encode!(&num_canisters).unwrap(), + ) + .unwrap(); + let _ = assert_reply(result); + + // We created `num_canisters` in addition to the seed canister. + assert_eq!(env.num_running_canisters(), num_canisters + 1); +} + fn assert_reply(res: WasmResult) -> Vec<u8> { match res { WasmResult::Reply(res) => res, diff --git a/rs/tests/driver/src/util.rs b/rs/tests/driver/src/util.rs index 111cbdceac21..ddca15be0b92 100644 --- a/rs/tests/driver/src/util.rs +++ b/rs/tests/driver/src/util.rs @@ -947,6 +947,8 @@ pub async fn agent_with_client_identity( .with_url(url) .with_http_client(client) .with_identity(identity) + // Setting a large polling time for the sake of long-running update calls. + .with_max_polling_time(Duration::from_secs(3600)) .with_max_concurrent_requests(MAX_CONCURRENT_REQUESTS) // Ingresses are created with the system time but are checked against the consensus time. // Consensus time is the time that is in the last finalized block. Consensus time might lag diff --git a/rs/tests/message_routing/BUILD.bazel b/rs/tests/message_routing/BUILD.bazel index d7fa93025d65..18a1168559e5 100644 --- a/rs/tests/message_routing/BUILD.bazel +++ b/rs/tests/message_routing/BUILD.bazel @@ -94,6 +94,29 @@ system_test_nns( ], ) +system_test_nns( + name = "rejoin_test_long_rounds", + enable_head_nns_variant = False, # only run this test with the mainnet NNS canisters.
+ env = UNIVERSAL_CANISTER_ENV | { + "STATESYNC_TEST_CANISTER_WASM_PATH": "$(rootpath //rs/rust_canisters/statesync_test:statesync-test-canister)", + }, + tags = [ + "manual", # this test does not pass currently (it serves as a baseline for future work) + "system_test_large", + ], + test_timeout = "eternal", + runtime_deps = + STATESYNC_TEST_CANISTER_RUNTIME_DEPS + + UNIVERSAL_CANISTER_RUNTIME_DEPS, + deps = [ + "//rs/registry/subnet_type", + "//rs/tests/driver:ic-system-test-driver", + "//rs/tests/message_routing/rejoin_test_lib", + "//rs/types/types", + "@crate_index//:anyhow", + ], +) + system_test_nns( name = "state_sync_malicious_chunk_test", enable_head_nns_variant = False, # only run this test with the mainnet NNS canisters. diff --git a/rs/tests/message_routing/Cargo.toml b/rs/tests/message_routing/Cargo.toml index 264ef0bb98c3..cf1218d0847b 100644 --- a/rs/tests/message_routing/Cargo.toml +++ b/rs/tests/message_routing/Cargo.toml @@ -42,6 +42,10 @@ path = "rejoin_test.rs" name = "rejoin_test_large_state" path = "rejoin_test_large_state.rs" +[[bin]] +name = "rejoin_test_long_rounds" +path = "rejoin_test_long_rounds.rs" + [[bin]] name = "state_sync_malicious_chunk_test" path = "state_sync_malicious_chunk_test.rs" diff --git a/rs/tests/message_routing/rejoin_test_lib/BUILD.bazel b/rs/tests/message_routing/rejoin_test_lib/BUILD.bazel index 8d58d305a550..3c72debbfb2e 100644 --- a/rs/tests/message_routing/rejoin_test_lib/BUILD.bazel +++ b/rs/tests/message_routing/rejoin_test_lib/BUILD.bazel @@ -11,10 +11,14 @@ rust_library( "//rs/rust_canisters/canister_test", "//rs/rust_canisters/dfn_candid", "//rs/tests/driver:ic-system-test-driver", + "//rs/types/types", + "//rs/universal_canister/lib", "@crate_index//:anyhow", "@crate_index//:candid", "@crate_index//:chrono", "@crate_index//:futures", + "@crate_index//:ic-agent", + "@crate_index//:ic-utils", "@crate_index//:slog", "@crate_index//:tokio", ], diff --git a/rs/tests/message_routing/rejoin_test_lib/Cargo.toml b/rs/tests/message_routing/rejoin_test_lib/Cargo.toml index 77db8d3e294a..93ffa63162c3 100644 --- a/rs/tests/message_routing/rejoin_test_lib/Cargo.toml +++ b/rs/tests/message_routing/rejoin_test_lib/Cargo.toml @@ -12,7 +12,11 @@ candid = { workspace = true } canister-test = { path = "../../../rust_canisters/canister_test" } chrono = { workspace = true } futures = { workspace = true } +ic-agent = { workspace = true } ic-system-test-driver = { path = "../../driver" } +ic-types = { path = "../../../types/types" } +ic-universal-canister = { path = "../../../universal_canister/lib" } +ic-utils = { workspace = true } slog = { workspace = true } tokio = { workspace = true } dfn_candid = { path = "../../../rust_canisters/dfn_candid" } diff --git a/rs/tests/message_routing/rejoin_test_lib/rejoin_test_lib.rs b/rs/tests/message_routing/rejoin_test_lib/rejoin_test_lib.rs index 4c2a7b184094..76c1b035dbe0 100644 --- a/rs/tests/message_routing/rejoin_test_lib/rejoin_test_lib.rs +++ b/rs/tests/message_routing/rejoin_test_lib/rejoin_test_lib.rs @@ -1,10 +1,17 @@ +use candid::{Encode, Principal}; use canister_test::{Canister, Runtime, Wasm}; use futures::future::join_all; +use ic_agent::Agent; use ic_system_test_driver::driver::test_env::TestEnv; use ic_system_test_driver::driver::test_env_api::get_dependency_path; use ic_system_test_driver::driver::test_env_api::retry_async; use ic_system_test_driver::driver::test_env_api::{HasPublicApiUrl, HasVm, IcNodeSnapshot}; use ic_system_test_driver::util::{MetricsFetcher, UniversalCanister, 
runtime_from_url}; +use ic_types::PrincipalId; +use ic_types::messages::ReplicaHealthStatus; +use ic_universal_canister::wasm; +use ic_utils::interfaces::management_canister::ManagementCanister; +use slog::Logger; use slog::info; use std::collections::BTreeMap; use std::env; @@ -214,6 +221,171 @@ pub async fn rejoin_test_large_state( assert_state_sync_has_happened(&logger, rejoin_node, base_count).await; } +async fn deploy_seed_canister( + ic00: &ManagementCanister<'_>, + effective_canister_id: PrincipalId, +) -> Principal { + let seed_canister_id = ic00 + .create_canister() + .as_provisional_create_with_amount(None) + .with_effective_canister_id(effective_canister_id.0) + .call_and_wait() + .await + .expect("Failed to create a seed canister") + .0; + let seed_canister_wasm_path = get_dependency_path( + env::var("STATESYNC_TEST_CANISTER_WASM_PATH") + .expect("STATESYNC_TEST_CANISTER_WASM_PATH not set"), + ); + let seed_canister_wasm = std::fs::read(seed_canister_wasm_path) + .expect("Could not read STATESYNC_TEST_CANISTER_WASM_PATH"); + ic00.install(&seed_canister_id, &seed_canister_wasm) + .await + .expect("Failed to install a seed canister"); + seed_canister_id +} + +async fn deploy_busy_canister(agent: &Agent, effective_canister_id: PrincipalId, logger: &Logger) { + let universal_canister = + UniversalCanister::new_with_retries(agent, effective_canister_id, logger).await; + universal_canister + .update( + wasm() + .set_heartbeat( + wasm() + .instruction_counter_is_at_least(1_800_000_000) + .build(), + ) + .reply() + .build(), + ) + .await + .expect("Failed to set up a busy canister."); +} + +pub async fn rejoin_test_long_rounds( + env: TestEnv, + num_canisters: usize, + dkg_interval: u64, + rejoin_node: IcNodeSnapshot, + agent_node: IcNodeSnapshot, +) { + let logger = env.logger(); + let agent = agent_node.build_default_agent_async().await; + let ic00 = ManagementCanister::create(&agent); + + let num_seed_canisters = 4; + info!( + logger, + "Deploying {} seed canisters on a node {} ...", + num_seed_canisters, + agent_node.get_public_url() + ); + let mut create_seed_canisters_futs = vec![]; + for _ in 0..num_seed_canisters { + create_seed_canisters_futs.push(deploy_seed_canister( + &ic00, + agent_node.effective_canister_id(), + )); + } + let seed_canisters = join_all(create_seed_canisters_futs).await; + + let num_canisters_per_seed_canister = num_canisters / num_seed_canisters; + info!( + logger, + "Creating {} canisters via the seed canisters ...", + num_canisters_per_seed_canister * num_seed_canisters, + ); + let mut create_many_canisters_futs = vec![]; + for seed_canister_id in seed_canisters { + let bytes = Encode!(&num_canisters_per_seed_canister) + .expect("Failed to candid encode argument for a seed canister"); + let fut = agent + .update(&seed_canister_id, "create_many_canisters") + .with_arg(bytes) + .call_and_wait(); + create_many_canisters_futs.push(fut); + } + let res = join_all(create_many_canisters_futs).await; + for r in res { + r.expect("Failed to create canisters via a seed canister"); + } + + // We deploy 8 "busy" canisters: this way, + // there are 2 canisters per each of the 4 scheduler cores + // and thus every thread executes 2 x 1.8B = 3.6B instructions. 
+ let num_busy_canisters = 8; + info!( + logger, + "Deploying {} busy canisters on a node {} ...", + num_busy_canisters, + agent_node.get_public_url() + ); + let mut create_busy_canisters_futs = vec![]; + for _ in 0..num_busy_canisters { + create_busy_canisters_futs.push(deploy_busy_canister( + &agent, + agent_node.effective_canister_id(), + &logger, + )); + } + join_all(create_busy_canisters_futs).await; + + info!( + logger, + "Killing a node: {} ...", + rejoin_node.get_public_url() + ); + rejoin_node.vm().kill(); + rejoin_node + .await_status_is_unavailable() + .expect("Node still healthy"); + + // Wait for the subnet to produce a CUP and then restart the rejoin_node. + // This way, the restarted node starts from that CUP + // and we can assert that it catches up by the next CUP. + info!(logger, "Waiting for a CUP ..."); + let agent_node_status = agent_node + .status_async() + .await + .expect("Failed to get status of agent_node"); + let latest_certified_height = agent_node_status + .certified_height + .expect("Failed to get certified height of agent_node") + .get(); + wait_for_cup(&logger, latest_certified_height, agent_node.clone()).await; + + info!(logger, "Start the killed node again ..."); + rejoin_node.vm().start(); + + info!(logger, "Waiting for the next CUP ..."); + let last_cup_height = wait_for_cup( + &logger, + latest_certified_height + dkg_interval + 1, + agent_node.clone(), + ) + .await; + + let rejoin_node_status = rejoin_node + .status_async() + .await + .expect("Failed to get status of rejoin_node"); + let rejoin_node_certified_height = rejoin_node_status + .certified_height + .expect("Failed to get certified height of rejoin_node") + .get(); + assert!( + rejoin_node_certified_height >= last_cup_height, + "The rejoin_node certified height {} is less than the last CUP height {}.", + rejoin_node_certified_height, + last_cup_height + ); + let rejoin_node_health_status = rejoin_node_status + .replica_health_status + .expect("Failed to get replica health status of rejoin_node"); + assert_eq!(rejoin_node_health_status, ReplicaHealthStatus::Healthy); +} + pub async fn assert_state_sync_has_happened( logger: &slog::Logger, rejoin_node: IcNodeSnapshot, diff --git a/rs/tests/message_routing/rejoin_test_long_rounds.rs b/rs/tests/message_routing/rejoin_test_long_rounds.rs new file mode 100644 index 000000000000..e0bf9678f915 --- /dev/null +++ b/rs/tests/message_routing/rejoin_test_long_rounds.rs @@ -0,0 +1,132 @@ +/* tag::catalog[] + +Title:: Nodes can rejoin a subnet with long DSM rounds + +Runbook:: +. set up the testnet of 3f + 1 nodes with f = 4 (like on mainnet) +. pick a random node and install 4 "seed" canisters through it (the state sync test canister is used as "seed") +. create 100,000 canisters via the "seed" canisters (in parallel) +. deploy 8 "busy" canisters (universal canister with heartbeats executing 1.8B instructions) +. pick another random node and kill that node +. wait for the subnet to produce a CUP +. start the killed node + +Success:: +.. if the restarted node catches up w.r.t.
its certified height and becomes healthy by the next CUP + +end::catalog[] */ + +use anyhow::Result; +use ic_registry_subnet_type::SubnetType; +use ic_system_test_driver::driver::farm::HostFeature; +use ic_system_test_driver::driver::group::SystemTestGroup; +use ic_system_test_driver::driver::ic::{ + AmountOfMemoryKiB, ImageSizeGiB, InternetComputer, NrOfVCPUs, Subnet, VmResources, +}; +use ic_system_test_driver::driver::pot_dsl::{PotSetupFn, SysTestFn}; +use ic_system_test_driver::driver::prometheus_vm::{HasPrometheus, PrometheusVm}; +use ic_system_test_driver::driver::test_env::TestEnv; +use ic_system_test_driver::driver::test_env_api::{ + HasPublicApiUrl, HasTopologySnapshot, IcNodeContainer, +}; +use ic_system_test_driver::systest; +use ic_system_test_driver::util::{block_on, get_app_subnet_and_node}; +use ic_types::Height; +use rejoin_test_lib::rejoin_test_long_rounds; +use std::time::Duration; + +const PER_TASK_TIMEOUT: Duration = Duration::from_secs(3600); +const OVERALL_TIMEOUT: Duration = Duration::from_secs(3600); +const NUM_CANISTERS: usize = 100_000; + +const NUM_NODES: usize = 13; // mainnet value +const DKG_INTERVAL: u64 = 499; // mainnet value + +fn main() -> Result<()> { + let config = Config::new(NUM_NODES, NUM_CANISTERS); + let test = config.clone().test(); + SystemTestGroup::new() + .with_setup(config.build()) + .add_test(systest!(test)) + .with_timeout_per_test(PER_TASK_TIMEOUT) + .with_overall_timeout(OVERALL_TIMEOUT) + .execute_from_args()?; + Ok(()) } + +#[derive(Clone, Debug)] +pub struct Config { + nodes_count: usize, + num_canisters: usize, +} + +impl Config { + pub fn new(nodes_count: usize, num_canisters: usize) -> Config { + Config { + nodes_count, + num_canisters, + } + } + + /// Builds the IC instance. + pub fn build(self) -> impl PotSetupFn { + move |env: TestEnv| setup(env, self) + } + + /// Returns a test function based on this configuration. + pub fn test(self) -> impl SysTestFn { + move |env: TestEnv| test(env, self) + } +} + +fn setup(env: TestEnv, config: Config) { + PrometheusVm::default() + .start(&env) + .expect("failed to start prometheus VM"); + + // VM resources are as for the "large" testnet. + let vm_resources = VmResources { + vcpus: Some(NrOfVCPUs::new(64)), + memory_kibibytes: Some(AmountOfMemoryKiB::new(480 << 20)), + boot_image_minimal_size_gibibytes: Some(ImageSizeGiB::new(2000)), + }; + InternetComputer::new() + .with_required_host_features(vec![HostFeature::Performance]) + .add_subnet( + Subnet::new(SubnetType::Application) + .with_default_vm_resources(vm_resources) + .with_dkg_interval_length(Height::from(DKG_INTERVAL)) + .add_nodes(config.nodes_count), + ) + .setup_and_start(&env) + .expect("failed to setup IC under test"); + + env.topology_snapshot().subnets().for_each(|subnet| { + subnet + .nodes() + .for_each(|node| node.await_status_is_healthy().unwrap()) + }); + + env.sync_with_prometheus(); +} + +fn test(env: TestEnv, config: Config) { + block_on(test_async(env, config)); +} + +async fn test_async(env: TestEnv, config: Config) { + let topology_snapshot = env.topology_snapshot(); + let (app_subnet, _) = get_app_subnet_and_node(&topology_snapshot); + + let mut nodes = app_subnet.nodes(); + let rejoin_node = nodes.next().unwrap(); + let agent_node = nodes.next().unwrap(); + rejoin_test_long_rounds( + env, + config.num_canisters, + DKG_INTERVAL, + rejoin_node.clone(), + agent_node.clone(), + ) + .await; +}
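For reference, a minimal standalone sketch of the bounded-concurrency fan-out that `create_many_canisters` implements above: futures are built lazily and at most 500 of them are driven at a time via `buffer_unordered`, matching the inter-canister queue capacity noted in the code comment. With NUM_CANISTERS = 100_000 split across 4 seed canisters, each seed canister performs 25_000 such creations per update call. The sketch assumes only the `futures` and `tokio` crates; `create_one` is a hypothetical placeholder for the actual `provisional_create_canister_with_cycles` call.

use futures::{StreamExt, stream};

// Hypothetical stand-in for the real management-canister call
// (`provisional_create_canister_with_cycles` in the canister code above).
async fn create_one(_i: u64) {
    // issue a single canister-creation call here
}

#[tokio::main]
async fn main() {
    // 100_000 canisters split across 4 seed canisters => 25_000 per seed canister.
    let n: u64 = 25_000;
    // Build the futures lazily, then drive at most 500 of them at a time so that
    // no more than 500 calls are in flight concurrently.
    stream::iter((0..n).map(create_one))
        .buffer_unordered(500)
        .collect::<Vec<()>>()
        .await;
}

The canister code above applies the same pattern to the `futs` vector it builds in the loop.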