Skip to content

Commit b8d1cbc

Browse files
authored
fix(runtimed): improve kernel launch diagnostics and pool env validation (#1022)
- Don't add UV/conda envs to pool when warmup fails (ipykernel may not work) - Pool::take() checks for .warmed marker file as defense-in-depth - Capture kernel process stderr (errors at warn, rest at debug) - Early crash detection via try_wait() after 500ms startup delay - Race kernel_info_reply against process death via tokio::select! - Log kernel PID at spawn time for debugging - warmup_conda_env returns bool to gate pool addition
1 parent 447c4f7 commit b8d1cbc

File tree

2 files changed

+213
-71
lines changed

2 files changed

+213
-71
lines changed

crates/runtimed/src/daemon.rs

Lines changed: 135 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -194,13 +194,16 @@ impl Pool {
194194
fn take(&mut self) -> Option<PooledEnv> {
195195
self.prune_stale();
196196

197-
// Try to get a valid environment, skipping any with missing paths
197+
// Try to get a valid environment, skipping any with missing paths or missing warmup
198198
while let Some(entry) = self.available.pop_front() {
199-
if entry.env.venv_path.exists() && entry.env.python_path.exists() {
199+
if entry.env.venv_path.exists()
200+
&& entry.env.python_path.exists()
201+
&& entry.env.venv_path.join(".warmed").exists()
202+
{
200203
return Some(entry.env);
201204
}
202205
warn!(
203-
"[runtimed] Skipping env with missing path: {:?}",
206+
"[runtimed] Skipping env with missing path or warmup marker: {:?}",
204207
entry.env.venv_path
205208
);
206209
}
@@ -2245,36 +2248,51 @@ impl Daemon {
22452248
}
22462249

22472250
// Run warmup script
2248-
self.warmup_conda_env(&python_path, &env_path).await;
2249-
2250-
// Add to pool and check if we're clearing a previous error state
2251-
let had_errors = {
2252-
let mut pool = self.conda_pool.lock().await;
2253-
let had = pool.failure_state.consecutive_failures > 0;
2254-
pool.add(PooledEnv {
2255-
env_type: EnvType::Conda,
2256-
venv_path: env_path.clone(),
2257-
python_path,
2258-
});
2259-
had
2260-
};
2251+
let warmup_ok = self.warmup_conda_env(&python_path, &env_path).await;
22612252

2262-
info!(
2263-
"[runtimed] Conda environment ready: {:?} (pool: {}/{})",
2264-
env_path,
2265-
self.conda_pool.lock().await.stats().0,
2266-
self.config.conda_pool_size
2267-
);
2253+
if warmup_ok {
2254+
// Add to pool and check if we're clearing a previous error state
2255+
let had_errors = {
2256+
let mut pool = self.conda_pool.lock().await;
2257+
let had = pool.failure_state.consecutive_failures > 0;
2258+
pool.add(PooledEnv {
2259+
env_type: EnvType::Conda,
2260+
venv_path: env_path.clone(),
2261+
python_path,
2262+
});
2263+
had
2264+
};
2265+
2266+
info!(
2267+
"[runtimed] Conda environment ready: {:?} (pool: {}/{})",
2268+
env_path,
2269+
self.conda_pool.lock().await.stats().0,
2270+
self.config.conda_pool_size
2271+
);
22682272

2269-
// Broadcast cleared state if we recovered from errors
2270-
if had_errors {
2271-
info!("[runtimed] Conda pool recovered from error state");
2273+
// Broadcast cleared state if we recovered from errors
2274+
if had_errors {
2275+
info!("[runtimed] Conda pool recovered from error state");
2276+
self.broadcast_pool_state().await;
2277+
}
2278+
} else {
2279+
// Clean up failed env and record failure
2280+
let _ = tokio::fs::remove_dir_all(&env_path).await;
2281+
self.conda_pool
2282+
.lock()
2283+
.await
2284+
.warming_failed_with_error(Some(PackageInstallError {
2285+
failed_package: None,
2286+
error_message: "Conda warmup script failed (ipykernel may not be installed)"
2287+
.into(),
2288+
}));
22722289
self.broadcast_pool_state().await;
22732290
}
22742291
}
22752292

22762293
/// Warm up a conda environment by running Python to trigger .pyc compilation.
2277-
async fn warmup_conda_env(&self, python_path: &PathBuf, env_path: &PathBuf) {
2294+
/// Returns `true` if warmup succeeded (ipykernel imports work).
2295+
async fn warmup_conda_env(&self, python_path: &PathBuf, env_path: &PathBuf) -> bool {
22782296
let warmup_script = r#"
22792297
import ipykernel
22802298
import IPython
@@ -2302,20 +2320,24 @@ print("warmup complete")
23022320
// Create marker file
23032321
tokio::fs::write(env_path.join(".warmed"), "").await.ok();
23042322
info!("[runtimed] Conda warmup complete for {:?}", env_path);
2323+
true
23052324
}
23062325
Ok(Ok(output)) => {
23072326
let stderr = String::from_utf8_lossy(&output.stderr);
2308-
warn!(
2327+
error!(
23092328
"[runtimed] Conda warmup failed for {:?}: {}",
23102329
env_path,
23112330
stderr.lines().take(3).collect::<Vec<_>>().join(" | ")
23122331
);
2332+
false
23132333
}
23142334
Ok(Err(e)) => {
2315-
warn!("[runtimed] Failed to run conda warmup: {}", e);
2335+
error!("[runtimed] Failed to run conda warmup: {}", e);
2336+
false
23162337
}
23172338
Err(_) => {
2318-
warn!("[runtimed] Conda warmup timed out");
2339+
error!("[runtimed] Conda warmup timed out");
2340+
false
23192341
}
23202342
}
23212343
}
@@ -2573,36 +2595,60 @@ print("warmup complete")
25732595
)
25742596
.await;
25752597

2576-
match warmup_result {
2598+
let warmup_ok = match warmup_result {
25772599
Ok(Ok(output)) if output.status.success() => {
25782600
// Create marker file
25792601
tokio::fs::write(venv_path.join(".warmed"), "").await.ok();
2602+
true
25802603
}
2581-
Ok(_) => {
2582-
warn!("[runtimed] Warmup script failed, continuing anyway");
2604+
Ok(Ok(output)) => {
2605+
let stderr = String::from_utf8_lossy(&output.stderr);
2606+
error!(
2607+
"[runtimed] UV warmup failed, NOT adding to pool: {}",
2608+
stderr.lines().take(3).collect::<Vec<_>>().join(" | ")
2609+
);
2610+
false
2611+
}
2612+
Ok(Err(e)) => {
2613+
error!("[runtimed] Failed to run UV warmup: {}", e);
2614+
false
25832615
}
25842616
Err(_) => {
2585-
warn!("[runtimed] Warmup script timed out, continuing anyway");
2617+
error!("[runtimed] UV warmup timed out, NOT adding to pool");
2618+
false
25862619
}
2587-
}
2620+
};
25882621

2589-
info!("[runtimed] UV environment ready at {:?}", venv_path);
2622+
if warmup_ok {
2623+
info!("[runtimed] UV environment ready at {:?}", venv_path);
25902624

2591-
// Add to pool and check if we're clearing a previous error state
2592-
let had_errors = {
2593-
let mut pool = self.uv_pool.lock().await;
2594-
let had = pool.failure_state.consecutive_failures > 0;
2595-
pool.add(PooledEnv {
2596-
env_type: EnvType::Uv,
2597-
venv_path,
2598-
python_path,
2599-
});
2600-
had
2601-
};
2625+
// Add to pool and check if we're clearing a previous error state
2626+
let had_errors = {
2627+
let mut pool = self.uv_pool.lock().await;
2628+
let had = pool.failure_state.consecutive_failures > 0;
2629+
pool.add(PooledEnv {
2630+
env_type: EnvType::Uv,
2631+
venv_path,
2632+
python_path,
2633+
});
2634+
had
2635+
};
26022636

2603-
// Broadcast cleared state if we recovered from errors
2604-
if had_errors {
2605-
info!("[runtimed] UV pool recovered from error state");
2637+
// Broadcast cleared state if we recovered from errors
2638+
if had_errors {
2639+
info!("[runtimed] UV pool recovered from error state");
2640+
self.broadcast_pool_state().await;
2641+
}
2642+
} else {
2643+
// Clean up failed env and record failure
2644+
let _ = tokio::fs::remove_dir_all(&venv_path).await;
2645+
self.uv_pool
2646+
.lock()
2647+
.await
2648+
.warming_failed_with_error(Some(PackageInstallError {
2649+
failed_package: None,
2650+
error_message: "Warmup script failed (ipykernel may not be installed)".into(),
2651+
}));
26062652
self.broadcast_pool_state().await;
26072653
}
26082654
}
@@ -2630,6 +2676,9 @@ mod tests {
26302676
}
26312677
std::fs::write(&python_path, "").unwrap();
26322678

2679+
// Create warmup marker so take() accepts this env
2680+
std::fs::write(venv_path.join(".warmed"), "").unwrap();
2681+
26332682
PooledEnv {
26342683
env_type: EnvType::Uv,
26352684
venv_path,
@@ -2894,6 +2943,43 @@ mod tests {
28942943
assert!(err.message.contains("scitkit-learn"));
28952944
}
28962945

2946+
#[test]
2947+
fn test_pool_take_skips_unwarmed() {
2948+
let temp_dir = TempDir::new().unwrap();
2949+
let mut pool = Pool::new(3, 3600);
2950+
2951+
// Create an env with valid paths but NO .warmed marker
2952+
let venv_path = temp_dir.path().join("unwarmed-env");
2953+
std::fs::create_dir_all(&venv_path).unwrap();
2954+
#[cfg(windows)]
2955+
let python_path = venv_path.join("Scripts").join("python.exe");
2956+
#[cfg(not(windows))]
2957+
let python_path = venv_path.join("bin").join("python");
2958+
if let Some(parent) = python_path.parent() {
2959+
std::fs::create_dir_all(parent).unwrap();
2960+
}
2961+
std::fs::write(&python_path, "").unwrap();
2962+
2963+
let unwarmed_env = PooledEnv {
2964+
env_type: EnvType::Uv,
2965+
venv_path,
2966+
python_path,
2967+
};
2968+
pool.add(unwarmed_env);
2969+
2970+
// take() should skip the unwarmed env
2971+
assert!(pool.take().is_none());
2972+
2973+
// Add a properly warmed env
2974+
let warmed_env = create_test_env(&temp_dir, "warmed-env");
2975+
pool.add(warmed_env.clone());
2976+
2977+
// take() should return the warmed env
2978+
let taken = pool.take();
2979+
assert!(taken.is_some());
2980+
assert_eq!(taken.unwrap().venv_path, warmed_env.venv_path);
2981+
}
2982+
28972983
#[test]
28982984
fn test_parse_uv_error_package_not_found() {
28992985
let stderr = r#"error: No solution found when resolving dependencies:

0 commit comments

Comments
 (0)