Skip to content

Commit ad381d3

Browse files
committed
fix(ic-backup, CON-1035) Improve more divergence check in order to reduce flakiness
1 parent a1f503d commit ad381d3

File tree

2 files changed

+29
-16
lines changed

2 files changed

+29
-16
lines changed

rs/backup/src/backup_helper.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -901,7 +901,7 @@ fn last_dir_height(dir: &PathBuf, radix: u32) -> u64 {
901901
}
902902
}
903903

904-
fn last_checkpoint(dir: &Path) -> u64 {
904+
pub fn last_checkpoint(dir: &Path) -> u64 {
905905
last_dir_height(&dir.join("checkpoints"), 16)
906906
}
907907

rs/tests/src/orchestrator/backup_manager.rs

Lines changed: 28 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@ use crate::{
4242
},
4343
util::{block_on, get_nns_node},
4444
};
45-
use ic_backup::backup_helper::ls_path;
45+
use ic_backup::backup_helper::{last_checkpoint, ls_path};
4646
use ic_backup::config::{ColdStorage, Config, SubnetConfig};
4747
use ic_backup::util::sleep_secs;
4848
use ic_base_types::SubnetId;
@@ -77,7 +77,7 @@ pub fn config(env: TestEnv) {
7777
pub fn test(env: TestEnv) {
7878
let log = env.logger();
7979

80-
// Create all directories
80+
info!(log, "Create all directories");
8181
let root_dir = tempfile::TempDir::new()
8282
.expect("failed to create a temporary directory")
8383
.path()
@@ -91,6 +91,7 @@ pub fn test(env: TestEnv) {
9191
.path()
9292
.to_path_buf();
9393

94+
info!(log, "Fetch the replica version");
9495
let nns_node = get_nns_node(&env.topology_snapshot());
9596
let node_ip: IpAddr = nns_node.get_ip_addr();
9697
let subnet_id = env.topology_snapshot().root_subnet_id();
@@ -99,6 +100,10 @@ pub fn test(env: TestEnv) {
99100
let initial_replica_version = ReplicaVersion::try_from(replica_version.clone())
100101
.expect("Assigned replica version should be valid");
101102

103+
info!(
104+
log,
105+
"Copy the binaries needed for replay of the current version"
106+
);
102107
let backup_binaries_dir = backup_dir.join("binaries").join(&replica_version);
103108
fs::create_dir_all(&backup_binaries_dir).expect("failure creating backup binaries directory");
104109

@@ -109,6 +114,10 @@ pub fn test(env: TestEnv) {
109114
copy_file(&binaries_path, &backup_binaries_dir, "sandbox_launcher");
110115
copy_file(&binaries_path, &backup_binaries_dir, "canister_sandbox");
111116

117+
info!(
118+
log,
119+
"Download the binaries needed for replay of the mainnet version"
120+
);
112121
let mainnet_version = env
113122
.read_dependency_to_string("testnet/mainnet_nns_revision.txt")
114123
.expect("could not read mainnet version!");
@@ -130,6 +139,7 @@ pub fn test(env: TestEnv) {
130139
.expect("chmod command failed");
131140
chmod.wait_with_output().expect("chmod execution failed");
132141

142+
info!(log, "Run ECDSA signature test");
133143
let nns_node = env.get_first_healthy_nns_node_snapshot();
134144
let agent = nns_node.build_default_agent();
135145
let nns_canister = block_on(MessageCanister::new(
@@ -146,6 +156,7 @@ pub fn test(env: TestEnv) {
146156
);
147157
run_ecdsa_signature_test(&nns_canister, &log, key);
148158

159+
info!(log, "Install universal canister");
149160
let log2 = log.clone();
150161
let id = nns_node.effective_canister_id();
151162
let canister_id_hex: String = block_on({
@@ -155,13 +166,13 @@ pub fn test(env: TestEnv) {
155166
}
156167
});
157168

158-
// Update the registry with the backup key
169+
info!(log, "Update the registry with the backup key");
159170
let payload = get_updatesubnetpayload_with_keys(subnet_id, None, Some(vec![backup_public_key]));
160171
block_on(update_subnet_record(nns_node.get_public_url(), payload));
161172
let backup_mean = AuthMean::PrivateKey(backup_private_key);
162173
wait_until_authentication_is_granted(&node_ip, "backup", &backup_mean);
163174

164-
// Fetch NNS public key
175+
info!(log, "Fetch NNS public key");
165176
let nns_public_key = env
166177
.prep_dir("")
167178
.expect("missing NNS public key")
@@ -229,6 +240,7 @@ pub fn test(env: TestEnv) {
229240
&log,
230241
));
231242

243+
info!(log, "Wait for archived checkpoint");
232244
let archive_dir = backup_dir.join("archive").join(subnet_id.to_string());
233245
// make sure we have some archive of the old version before upgrading to the new one
234246
loop {
@@ -305,6 +317,7 @@ pub fn test(env: TestEnv) {
305317
info!(log, "Modify memory file: {:?}", memory_artifact_path);
306318
modify_byte_in_file(memory_artifact_path).expect("Modifying a byte failed");
307319

320+
info!(log, "Start again the backup process in a separate thread");
308321
let mut command = Command::new(&ic_backup_path);
309322
command
310323
.arg("--config-file")
@@ -317,10 +330,12 @@ pub fn test(env: TestEnv) {
317330
.expect("Failed to start backup process");
318331
info!(log, "Started process: {}", child.id());
319332

320-
assert!(cold_storage_exists(
321-
&log,
322-
cold_storage_dir.join(subnet_id.to_string())
323-
));
333+
if !cold_storage_exists(&log, cold_storage_dir.join(subnet_id.to_string())) {
334+
info!(log, "Kill child process");
335+
child.kill().expect("Error killing backup process");
336+
panic!("No cold storage");
337+
}
338+
324339
info!(log, "Artifacts and states are moved to cold storage");
325340

326341
let mut hash_mismatch = false;
@@ -365,17 +380,15 @@ fn some_checkpoint_dir(backup_dir: &Path, subnet_id: &SubnetId) -> Option<PathBu
365380
let dir = backup_dir
366381
.join("data")
367382
.join(subnet_id.to_string())
368-
.join("ic_state")
369-
.join("checkpoints");
383+
.join("ic_state");
370384
if !dir.exists() {
371385
return None;
372386
}
373-
if let Ok(mut cps) = fs::read_dir(dir) {
374-
if let Some(Ok(cp)) = cps.next() {
375-
return Some(cp.path());
376-
}
387+
let lcp = last_checkpoint(&dir);
388+
if lcp == 0 {
389+
return None;
377390
}
378-
None
391+
Some(dir.join(format!("checkpoints/{:016x}", lcp)))
379392
}
380393

381394
fn modify_byte_in_file(file_path: PathBuf) -> std::io::Result<()> {

0 commit comments

Comments
 (0)