Skip to content

Commit 6c56f8f

Browse files
🎨 add SIGBUS startup failure warning message (#34)
If a shard dies from signal 7 (SIGBUS), we log a warning recommending that more shared memory be made available. Signed-off-by: Prashant Gupta <[email protected]> Co-authored-by: PRASHANT GUPTA <[email protected]>
1 parent 1f4cfbe commit 6c56f8f

File tree

1 file changed

+25
-11
lines changed

1 file changed

+25
-11
lines changed

launcher/src/main.rs

Lines changed: 25 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ use nix::unistd::Pid;
44
use std::env;
55
use std::io::{BufRead, BufReader, ErrorKind, Write};
66
use std::path::Path;
7-
use std::process::{Command, ExitCode, Stdio};
7+
use std::process::{Command, ExitCode, ExitStatus, Stdio};
88
use std::sync::atomic::{AtomicBool, Ordering};
99
use std::sync::mpsc::TryRecvError;
1010
use std::sync::Arc;
@@ -16,7 +16,7 @@ use std::{fs, io};
1616
use std::env::VarError;
1717
use std::ffi::OsString;
1818
use std::fs::File;
19-
use std::os::unix::process::CommandExt;
19+
use std::os::unix::process::{CommandExt, ExitStatusExt};
2020
use tracing::{info, warn};
2121

2222
// In most cases this gives the best performance for inferencing
@@ -238,7 +238,7 @@ fn main() -> ExitCode {
238238
Err(TryRecvError::Empty) => {
239239
sleep(Duration::from_millis(100));
240240
}
241-
Ok(ShardStatus::Failed) => {
241+
Ok(ShardStatus::Failed(_status)) => {
242242
shutdown_shards(shutdown, shutdown_receiver);
243243
return ExitCode::FAILURE;
244244
}
@@ -347,9 +347,17 @@ fn main() -> ExitCode {
347347
let mut exit_code = ExitCode::SUCCESS;
348348

349349
while running.load(Ordering::SeqCst) {
350-
if let Ok(ShardStatus::Failed) = status_receiver.try_recv() {
350+
if let Ok(ShardStatus::Failed(status)) = status_receiver.try_recv() {
351351
exit_code = ExitCode::FAILURE;
352-
break;
352+
terminate_gracefully(&mut webserver, shutdown.clone(), shutdown_receiver);
353+
if status.signal() == Some(7) && num_shard > 1 {
354+
panic!(
355+
"Encountered SIGBUS error. This is usually caused by NCCL having insufficient shared memory. \
356+
Ensure at least 1GB of shared memory is available. In case of OpenShift/K8s, \
357+
mount a memory medium emptyDir volume to /dev/shm"
358+
)
359+
}
360+
return exit_code
353361
};
354362

355363
match webserver.try_wait().expect("Error polling status of router process") {
@@ -362,17 +370,21 @@ fn main() -> ExitCode {
362370
};
363371
}
364372

365-
// Graceful termination
373+
terminate_gracefully(&mut webserver, shutdown.clone(), shutdown_receiver);
374+
375+
exit_code
376+
}
377+
378+
/// Graceful termination
379+
fn terminate_gracefully(webserver: &mut std::process::Child, shutdown: Arc<Mutex<bool>>, shutdown_receiver: &mpsc::Receiver<()>) {
366380
signal::kill(Pid::from_raw(webserver.id() as i32), Signal::SIGTERM).unwrap();
367381
info!("Waiting for router to gracefully shutdown");
368382
webserver.wait().unwrap();
369383
info!("Router terminated");
370384
shutdown_shards(shutdown, &shutdown_receiver);
371385

372-
exit_code
373386
}
374387

375-
376388
fn num_cuda_devices() -> Option<usize> {
377389
let devices = match env::var("CUDA_VISIBLE_DEVICES") {
378390
Ok(devices) => devices,
@@ -481,7 +493,7 @@ fn find_num_shards(num_shard: Option<usize>) -> usize {
481493
#[derive(Debug)]
482494
enum ShardStatus {
483495
Ready,
484-
Failed,
496+
Failed(ExitStatus),
485497
}
486498

487499
#[allow(clippy::too_many_arguments)]
@@ -619,7 +631,7 @@ fn shard_manager(
619631
} else {
620632
tracing::error!("Shard {rank} failed to start:\n{err}");
621633
}
622-
status_sender.send(ShardStatus::Failed).unwrap();
634+
status_sender.send(ShardStatus::Failed(ExitStatus::from_raw(0))).unwrap();
623635
return
624636
}
625637
};
@@ -654,7 +666,9 @@ fn shard_manager(
654666
io::stdout().flush().unwrap_or_default();
655667
stderr_thread.join().unwrap_or_default();
656668
io::stderr().flush().unwrap_or_default();
657-
status_sender.send(ShardStatus::Failed).unwrap();
669+
status_sender
670+
.send(ShardStatus::Failed(status))
671+
.unwrap();
658672
}
659673
return
660674
}

0 commit comments

Comments
 (0)