Skip to content

Commit 209ac74

Browse files
raymondk, lwshang, and claude
authored
fix: Network fails to start if a stale network descriptor is present (#466)
* fix: Network fails to start if a stale network descriptor is present

* changelog

* fmt

* fix: simplify stale descriptor cleanup and fix ordering

Clean port descriptor before project descriptor so the global resource is freed first. Pass gateway_port() directly since cleanup_port_descriptor already handles None.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

---------

Co-authored-by: Linwei Shang <linwei.shang@dfinity.org>
Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 02b7c5a commit 209ac74

File tree

4 files changed

+95
-8
lines changed

4 files changed

+95
-8
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
# Unreleased
22

33
* feat: Many more commands support `--json` and `--quiet`.
4+
* fix: Network would fail to start if a stale descriptor was present
45

56
# v0.2.1
67

crates/icp-cli/src/commands/network/start.rs

Lines changed: 18 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ use icp::{
1515
},
1616
settings::Settings,
1717
};
18-
use tracing::{debug, info};
18+
use tracing::{debug, info, warn};
1919

2020
use super::args::NetworkOrEnvironmentArgs;
2121
use icp::context::Context;
@@ -78,8 +78,23 @@ pub(crate) async fn exec(ctx: &Context, args: &StartArgs) -> Result<(), anyhow::
7878
nd.ensure_exists()
7979
.context("failed to create network directory")?;
8080

81-
if nd.load_network_descriptor().await?.is_some() {
82-
bail!("network '{}' is already running", network.name);
81+
if let Some(descriptor) = nd.load_network_descriptor().await? {
82+
debug!(
83+
"Found network descriptor for {} in: {}",
84+
nd.network_name, nd.network_root
85+
);
86+
if descriptor.child_locator.is_alive().await {
87+
bail!("network '{}' is already running", network.name);
88+
} else {
89+
warn!(
90+
"Found stale network descriptor for '{}' (process is no longer running). \
91+
Cleaning up and starting fresh.",
92+
network.name
93+
);
94+
nd.cleanup_port_descriptor(descriptor.gateway_port())
95+
.await?;
96+
nd.cleanup_project_network_descriptor().await?;
97+
}
8398
}
8499

85100
// Clean up any existing canister ID mappings of which environment is on this network

crates/icp-cli/tests/canister_install_tests.rs

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -479,7 +479,7 @@ async fn canister_install_with_environment_settings_override() {
479479
- type: script
480480
command: cp '{wasm}' "$ICP_WASM_OUTPUT_PATH"
481481
settings:
482-
memory_allocation: 1073741824
482+
memory_allocation: 10485760
483483
484484
{NETWORK_RANDOM_PORT}
485485
@@ -488,7 +488,7 @@ async fn canister_install_with_environment_settings_override() {
488488
network: random-network
489489
settings:
490490
my-canister:
491-
memory_allocation: 2147483648
491+
memory_allocation: 20971520
492492
"#};
493493

494494
write_string(&project_dir.join("icp.yaml"), &pm).expect("failed to write project manifest");
@@ -497,7 +497,7 @@ async fn canister_install_with_environment_settings_override() {
497497
let _g = ctx.start_network_in(&project_dir, "random-network").await;
498498
ctx.ping_until_healthy(&project_dir, "random-network");
499499

500-
// Deploy should use the environment override (memory_allocation: 2GB)
500+
// Deploy should use the environment override (memory_allocation: 20MiB)
501501
clients::icp(&ctx, &project_dir, Some("random-environment".to_string()))
502502
.mint_cycles(10 * TRILLION);
503503

@@ -532,8 +532,8 @@ async fn canister_install_with_environment_settings_override() {
532532

533533
let output_str = String::from_utf8_lossy(&output);
534534
assert!(
535-
output_str.contains("Memory allocation: 2_147_483_648"),
536-
"Expected memory_allocation to be 2_147_483_648 (2GB) from environment override, got: {}",
535+
output_str.contains("Memory allocation: 20_971_520"),
536+
"Expected memory_allocation to be 20_971_520 (20MiB) from environment override, got: {}",
537537
output_str
538538
);
539539
}

crates/icp-cli/tests/network_tests.rs

Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -874,3 +874,74 @@ async fn network_gateway_binds_to_configured_interface() {
874874
resp.status()
875875
);
876876
}
877+
878+
#[tokio::test]
879+
async fn network_recovers_from_stale_descriptor() {
880+
let ctx = TestContext::new();
881+
let project_dir = ctx.create_project_dir("stale-descriptor");
882+
883+
// Project manifest
884+
write_string(&project_dir.join("icp.yaml"), NETWORK_RANDOM_PORT)
885+
.expect("failed to write project manifest");
886+
887+
// Ensure the network descriptor directory exists
888+
let network_dir = project_dir.join(".icp/cache/networks/random-network");
889+
std::fs::create_dir_all(&network_dir).expect("failed to create network directory");
890+
891+
// Create a stale descriptor with a PID that cannot exist
892+
let stale_descriptor = serde_json::json!({
893+
"v": "1",
894+
"id": "11111111-1111-1111-1111-111111111111",
895+
"project-dir": project_dir.to_string(),
896+
"network": "random-network",
897+
"network-dir": network_dir.to_string(),
898+
"gateway": {
899+
"fixed": false,
900+
"port": 9999,
901+
"host": "localhost",
902+
"ip": "127.0.0.1"
903+
},
904+
"child-locator": {
905+
"type": "pid",
906+
"pid": u32::MAX, // Non-existent PID
907+
"start-time": 0
908+
},
909+
"root-key": "308182301c300d06092a864886f70d0101010500030b008081007f",
910+
"pocketic-config-port": null,
911+
"pocketic-instance-id": null,
912+
"candid-ui-canister-id": null,
913+
"proxy-canister-id": null,
914+
"status-dir": null,
915+
"use-friendly-domains": false
916+
});
917+
918+
// Write the stale descriptor
919+
let descriptor_bytes =
920+
serde_json::to_vec(&stale_descriptor).expect("failed to serialize descriptor");
921+
ctx.write_network_descriptor(&project_dir, "random-network", &descriptor_bytes);
922+
923+
// Start network - should succeed and clean up the stale descriptor
924+
ctx.icp()
925+
.current_dir(&project_dir)
926+
.args(["network", "start", "random-network", "--background"])
927+
.assert()
928+
.success()
929+
.stderr(contains("Found stale network descriptor"));
930+
931+
// Verify the network actually started (descriptor should be updated with real process)
932+
let network = ctx.wait_for_network_descriptor(&project_dir, "random-network");
933+
934+
ctx.ping_until_healthy(&project_dir, "random-network");
935+
936+
// Verify we can query the network
937+
let agent = ic_agent::Agent::builder()
938+
.with_url(format!("http://127.0.0.1:{}", network.gateway_port))
939+
.build()
940+
.expect("Failed to build agent");
941+
942+
let status = agent.status().await.expect("Failed to get network status");
943+
assert!(
944+
matches!(&status.replica_health_status, Some(health) if health == "healthy"),
945+
"Network should be healthy"
946+
);
947+
}

0 commit comments

Comments
 (0)